#==============================================================================#
# R script for:
# "Betting on the Underdog: The Influence of Social Networks on Vote Choice"
# Political Science Research Methods 
# 
# run as: Rscript main.R
#==============================================================================#

start = proc.time()
suppressMessages(library(rms)) # Cluster-robust standard errors 
suppressMessages(library(ggplot2)) # For plotting
suppressMessages(library(stargazer)) # Optionally, to format output

dir.create("figures")
dir.create("tables")

df = read.csv("main_dataset.csv")
df$cluster_id = df$primary_key
set.seed(2019)

#==============================================================================#
# Table 2. Cross-tabulation of the vote for the underdog, by experimental group
#==============================================================================#

# Breaking down in three groups
t2a = round(prop.table(table(df$player_treatment_group, df$player_decision),m=1)*100,2)
# Breaking down in four groups
t2b = round(prop.table(table(df$treatment_4, df$player_decision),m=1)*100,2)
# Overall vote breakdown
t2c = round(prop.table(table(df$player_decision))*100,2)
# Combining into a single table
t2 = rbind(t2a[c(1,3,2),], t2b[c(3,2),], t2c)
rownames(t2) = c("Control", "Random Network", "Homophilic Network", "  Low Signal", "  High Signal", "All")
colnames(t2) = c("Safe Option (S)", "Underdog (U)")

# Writing table to file
sink("tables/table2.txt")
cat("Table 2: Cross-tabulation of the vote for the underdog, by experimental group\n\n")
print(t2)
cat("\nObservations\t\t\t\t",nrow(df))
sink()

# Display table for log file: Table 2 in paper
t2

#==============================================================================#
# Difference between treatment groups (p-values reported in text)
#==============================================================================#

df_hc = df[df$player_treatment_group=='control' | df$player_treatment_group=='homophily', ]
df_rc = df[df$player_treatment_group=='control' | df$player_treatment_group=='random', ]

# Ordinary test of difference in proportions

## Comparison Control v. Homophily 
t1 = table(df_hc$homophily_binary, df_hc$player_decision)
cat("Difference in proportions Control v. Homophily (p-value):\n")
cat(round(prop.test(x = t1[,2], n = t1[,1]+t1[,2], correct=F)$p.value,3),"\n")
## Comparison Control v. Random
t2 = table(df_rc$random_binary, df_rc$player_decision)
cat("Difference in proportions Control v. Random (p-value):\n")
cat(round(prop.test(x = t2[,2], n = t2[,1]+t2[,2], correct=F)$p.value,3),"\n")

# Using cluster-robust t-tests
cat("Difference in proportions Control v. Homophily (cluter-robust p-value):\n")
m1 = ols(binary_choice~homophily_binary, x=T, y=T, data=df_hc)
bc1 = bootcov(m1, cluster=df_hc$cluster_id, B=1000, pr=F)
print(bc1)

cat("Difference in proportions Control v. Random (cluter-robust p-value):\n")
m2 = ols(binary_choice~random_binary, x=T, y=T, data=df_rc)
bc2 = bootcov(m2, cluster=df_rc$cluster_id, B=1000, pr=F)
print(bc2)

#==============================================================================#
# Table 3. Treatment effects 
#         (logistic regressions with cluster-robust standard errors)
#==============================================================================#
cat("Computing Model 1.\n")
m1 = lrm(binary_choice~treatment_4+round, x=T, y=T, data=df)
brse1 = bootcov(m1, cluster=df$cluster_id, B=1000, pr=F)

cat("Computing Model 2.\n")
m2 = lrm(binary_choice~treatment_4+private_signal+round, x=T, y=T, data=df)
brse2 = bootcov(m2, cluster=df$cluster_id, B=1000, pr=F)

cat("Computing Model 3.\n")
m3 = lrm(binary_choice~treatment_4+private_signal+risk+round, x=T, y=T, data=df)
brse3 = bootcov(m3, cluster=df$cluster_id, B=1000, pr=F)

covariate.labels = c("Homophilic Network (Low Signal)",
                     "Homophilic Network (High Signal)", 
                     "Random Network",
                     "Private Signal",
                     "Tolerance to Risk",
                     "Round",
                     "Constant")
ordering = c(2,1,3:7)

sink("tables/table3.txt")
cat("Table 3: Treatment effects (logistic regressions with cluster-robust standard errors)\n\n")
stargazer(brse1, brse2, brse3, 
          star.cutoffs = c(0.05,0.01,0.001), 
          covariate.labels=covariate.labels,
          dep.var.caption = "Vote Choice (Underdog=1)",
          dep.var.labels = "",
          column.labels = c("Model 1", "Model 2", "Model 3"),
          order=ordering,
          type="text")
sink()

# Print results to log file: Table 3 
stargazer(brse1, brse2, brse3, 
          star.cutoffs = c(0.05,0.01,0.001), 
          covariate.labels=covariate.labels,
          dep.var.caption = "Vote Choice (Underdog=1)",
          dep.var.labels = "",
          column.labels = c("Model 1", "Model 2", "Model 3"),
          order=ordering, type="text")



#==============================================================================#
# Table A1: Descriptive Statistics
#==============================================================================#

df_p = df[,c("primary_key",
             "player_treatment_group",
             "survey.1.player.age",                   
             "survey.1.player.gender",                   
             "survey.1.player.education",                
             "risk",
             "homophily_binary",
             "random_binary")]
df_p = df_p[!duplicated(df_p),] 

p1=data.frame(table(df_p$player_treatment_group))
p1$Var1 = c("Control", "Random Network", "Homophilic Network")
p2=data.frame(table(df_p$survey.1.player.age))
p3=data.frame(table(df_p$survey.1.player.gender))
p4=data.frame(table(df_p$survey.1.player.education))

tA1a = rbind(p1[c(1,3,2),],p2,p3,p4[c(3,4,2,1),])
tA1a$Variable = c("Experimental Group","","","Age","","","Gender","","Education","","","")
tA1a = tA1a[,c(3,1,2)]; names(tA1a) = c("Variable", "Category/Statistic", "Value")
tA1b = data.frame(c("Tolerance to Risk",""),
                  c("Mean", "Std. Deviation"), 
                  (c(round(mean(df_p$risk),2), round(sd(df_p$risk),2))))
names(tA1b) = c("Variable", "Category/Statistic", "Value")
tA1a$Value = as.character(tA1a$Value); tA1b$Value = as.character(tA1b$Value)
tA1=rbind(tA1a,tA1b)

sink("tables/tableA1.txt")
cat("Table A1: Descriptive Statistics\n\n")
print(tA1, row.names=F)
cat("Participants \t\t\t\t\t\t\t\t96\n")
sink()

# Print to log file: Table A1
tA1

#==============================================================================#
# Table A2: Balance Checks
#==============================================================================#

# At participant level
df_p$aged25 = ifelse(df_p$survey.1.player.age!='18-24 years old',1,0)
Af1 = homophily_binary~aged25+survey.1.player.gender+survey.1.player.education+risk
Af2 = random_binary~aged25+survey.1.player.gender+survey.1.player.education+risk

Am1 = glm(Af1, data=df_p, family=binomial(link="logit"))
Am2 = glm(Af2, data=df_p, family=binomial(link="logit"))

covariate.labels = c("Aged 25 and Above",
                     "Bachelor Degree",
                     "High School Degree",
                     "Some Higher Education",
                     "Gender = Male", 
                     "Tolerance to Risk",
                     "Constant")
ordering = c(1,4:5,3,2,6:7)

sink("tables/tableA2.txt")
cat("Table A2: Balance Checks\n\n")
stargazer(Am1,Am2,  
          star.cutoffs = c(0.05,0.01,0.001), 
          covariate.labels=covariate.labels,
          dep.var.caption = "Treatment Group",
          dep.var.labels = c("",""),
          column.labels = c("Homophilic Network", "Random Network"),
          order=ordering,
          type="text")
sink()

# Print results in table A2 to log file 
stargazer(Am1,Am2,  
          star.cutoffs = c(0.05,0.01,0.001), 
          covariate.labels=covariate.labels,
          dep.var.caption = "Treatment Group",
          dep.var.labels = c("",""),
          column.labels = c("Homophilic Network", "Random Network"),
          order=ordering,
          type="text")

#==============================================================================#
# Table A3: Cross-tabulation of the vote for the underdog, 
# by restricted subsamples
#==============================================================================#

# Restricted samples
df_low = df[df$private_signal<5 & (df$treatment_4=="Homophily_Low" | df$treatment_4=="Control"),]
df_high = df[df$private_signal>5 & (df$treatment_4=="Homophily_High" | df$treatment_4=="Control"),]

tA3a = round(prop.table(table(df_low$low_homophily, df_low$player_decision),m=1)*100,2)
tA3b = round(prop.table(table(df_high$high_homophily, df_high$player_decision),m=1)*100,2)
colnames(tA3a) = c("Safe Option (S)", "Underdog (U)")
rownames(tA3a) = c("Control", "Homophilic Treatment (Low Signal)")
colnames(tA3b) = c("Safe Option (S)", "Underdog (U)")
rownames(tA3b) = c("Control", "Homophilic Treatment (High Signal)")

# Writing table to file
sink("tables/tableA3.txt")
cat("Table A3: Cross-tabulation of the vote for the underdog, by restricted subsamples\n\n")
cat("Subsample: x < 5\n")
print(tA3a)
cat("\nObservations\t",nrow(df_low),"\n\n")
cat("Subsample: x > 5\n")
print(tA3b)
cat("\nObservations\t",nrow(df_high))
sink()

# Print table 3A to log file
cat("Table A3: Cross-tabulation of the vote for the underdog, by restricted subsamples\n\n")
cat("Subsample: x < 5\n")
print(tA3a)
cat("\nObservations\t",nrow(df_low),"\n\n")
cat("Subsample: x > 5\n")
print(tA3b)
cat("\nObservations\t",nrow(df_high))

#==============================================================================#
# Difference between treatment groups (p-values reported in appendix)
#==============================================================================#

# Ordinary test of difference in proportions

cat("Conditional difference in proportions Control v. Low Homophily (p-value):\n")
tA3a = table(df_low$low_homophily, df_low$player_decision)
cat(round(prop.test(x = tA3a[,2], n = tA3a[,1]+tA3a[,2], correct=F)$p.value,4), "\n\n")

cat("Conditional difference in proportions Control v. High Homophily (p-value):\n")
tA3b = table(df_high$high_homophily, df_high$player_decision)
cat(round(prop.test(x = tA3b[,2], n = tA3b[,1]+tA3b[,2], correct=F)$p.value,4), "\n\n")

cat("Treatment effect (high homophily):\n")
cat(round((prop.table(tA3b, m=1)[2,2] - prop.table(tA3b, m=1)[1,2])*100,1), "%\n\n")

# Using cluster-robust t-tests
cat("Conditional difference in proportions Control v. Low Homophily (cluster-robust p-value):\n")
m1 = ols(binary_choice~low_homophily, x=T, y=T, data=df_low)
bc1 = bootcov(m1, cluster=df_low$cluster_id, B=1000, pr=F)
print(bc1)

## Comparison Control v. High Signal Homophily (x > 5)
cat("Conditional difference in proportions Control v. High Homophily (cluster-robust p-value):\n")
m2 = ols(binary_choice~high_homophily, x=T, y=T, data=df_high)
bc2 = bootcov(m2, cluster=df_high$cluster_id, B=1000, pr=F)
print(bc2)

#==============================================================================#
# Table A4: Difference in Predicted Probabilities
#==============================================================================#

sims = data.frame(treatment_4 = c("Homophily_High", "Homophily_High","Homophily_High", 
                                  "Control", "Control","Control"),
                  private_signal = c(5,2.5,7.5,5,2.5,7.5), 
                  risk = c(5,5,5,5,5,5), 
                  round = c(5,5,5,5,5,5))
phat = round(predict(m3, newdata=sims, type="fitted"),3)
predicted_probs = data.frame(x_i = c(5,2.5,7.5), 
                             Homophily_High = c(phat[1], 
                                         phat[2],
                                         phat[3]),
                             Control = c(phat[4], 
                                         phat[5],
                                         phat[6]),
                             Difference = c(phat[1]-phat[4], 
                                         phat[2]-phat[5],
                                         phat[3]-phat[6]))

sink("tables/tableA4.txt")
cat("Table A4: Difference in Predicted Probabilities (Table 3, Model 3)\n\n")
print(predicted_probs, row.names=F)
sink()

# Print table A4 to log file
cat("Table A4: Difference in Predicted Probabilities (Table 3, Model 3)\n\n")
print(predicted_probs, row.names=F)

#==============================================================================#
# Figure A6: Vote for underdog by experimental group and round
#==============================================================================#

df$round_factor = '01-05'
df$round_factor[df$round<=10 & df$round>5] = '06-10'
df$round_factor[df$round<=15 & df$round>10] = '11-15'
df$round_factor[df$round<=20 & df$round>15] = '16-20'
figA6 = aggregate(df$binary_choice, 
                  list(round=df$round_factor, 
                  treatment=df$treatment_4),
                  FUN=mean ,na.rm=T)
names(figA6)[3]='prop'
cols = gray.colors(4, start=0, end=0.7, alpha = 0.7)

pdf("figures/byround.pdf",width=7,height=5,paper='special')
ggplot(data=figA6, aes(x=treatment, y=prop, fill=round)) + theme_bw() +
  geom_bar(stat="identity", position=position_dodge(), colour="black") +
  scale_fill_manual(values=cols) +
  guides(fill=guide_legend(title="Rounds")) +
  ylab("Proportion Voting for Underdog") + 
  xlab("Treatment Group")
dev.off()

#### ============ END =========== ###

cat("Results saved in the tables/ and figures/ directories.\n\n")
cat("Total computation time:\n")
print(proc.time() - start)
